STAT 651 : Final Project

1.Load Necessary Libraries

library(dplyr)
Warning: package 'dplyr' was built under R version 4.4.1

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.4.1
library(lubridate)
Warning: package 'lubridate' was built under R version 4.4.1

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
library(plotly)
Warning: package 'plotly' was built under R version 4.4.1

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
library(leaflet)
Warning: package 'leaflet' was built under R version 4.4.1
library(viridis)
Warning: package 'viridis' was built under R version 4.4.1
Loading required package: viridisLite
Warning: package 'viridisLite' was built under R version 4.4.1
library(scales)
Warning: package 'scales' was built under R version 4.4.2

Attaching package: 'scales'
The following object is masked from 'package:viridis':

    viridis_pal
library(stringr)
Warning: package 'stringr' was built under R version 4.4.1
library(tigris)
Warning: package 'tigris' was built under R version 4.4.2
To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.

2. Load dataset

data <- read.csv("us-counties-2023.csv", stringsAsFactors = FALSE)
head(data)
      date  county   state fips cases deaths
1 1/1/2023 Autauga Alabama 1001 18961    230
2 1/1/2023 Baldwin Alabama 1003 67496    719
3 1/1/2023 Barbour Alabama 1005  7027    111
4 1/1/2023    Bibb Alabama 1007  7692    108
5 1/1/2023  Blount Alabama 1009 17731    260
6 1/1/2023 Bullock Alabama 1011  2886     54
str(data)
'data.frame':   267009 obs. of  6 variables:
 $ date  : chr  "1/1/2023" "1/1/2023" "1/1/2023" "1/1/2023" ...
 $ county: chr  "Autauga" "Baldwin" "Barbour" "Bibb" ...
 $ state : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
 $ fips  : int  1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
 $ cases : int  18961 67496 7027 7692 17731 2886 6185 39458 10311 6456 ...
 $ deaths: int  230 719 111 108 260 54 130 665 174 133 ...

3. Bar Plots

data_1 <- data %>%
  group_by(state) %>%
  summarize(
    total_cases = sum(cases, na.rm = TRUE),
    total_deaths = sum(deaths, na.rm = TRUE)
  ) %>%
  mutate(sep = case_when(
    total_cases >= total_deaths ~ "High Cases",
    TRUE ~ "High Deaths"
  ))

# Top 5 states by cases
top_cases <- data_1 %>%
  arrange(desc(total_cases)) %>%
  slice(1:5)

# Top 5 states by deaths
top_deaths <- data_1 %>%
  arrange(desc(total_deaths)) %>%
  slice(1:5)

# Bar plot for top 5 states by cases
ggplot(top_cases, aes(x = reorder(state, -total_cases), y = total_cases, fill = sep)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = label_comma()) +  # Format y-axis with commas
  labs(
    title = "Top 5 States by Cases",
    x = "State",
    y = "Total Cases",
    fill = "Category"
  ) +
  theme_minimal()

# Bar plot for top 5 states by deaths
ggplot(top_deaths, aes(x = reorder(state, -total_deaths), y = total_deaths, fill = sep)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = label_comma()) +  # Format y-axis with commas
  labs(
    title = "Top 5 States by Deaths",
    x = "State",
    y = "Total Deaths",
    fill = "Category"
  ) +
  theme_minimal()

4. Facet Plots

# Parse date, extract day, and summarize data
data_2 <- data %>%
  mutate(
    date = mdy(date),  # Parse date
    day = day(date),  # Extract day of the month
    month_label = month(date, label = TRUE, abbr = FALSE)  # Extract full month names
  ) %>%
  group_by(state, month_label, day) %>%  # Group by state, month, and day
  summarize(
    total_cases = sum(cases, na.rm = TRUE),
    total_deaths = sum(deaths, na.rm = TRUE),
    .groups = "drop"  
  )

selected_states <- c("California", "Texas", "Florida")
data_filtered <- data_2 %>%
  filter(state %in% selected_states)

# Plot for daily deaths
facet_plot_deaths <- ggplot(data_filtered, aes(x = day, y = total_deaths, color = state)) +
  geom_point(size = 2, alpha = 0.8) +
  facet_wrap(~month_label, scales = "free_y") + 
  scale_y_continuous(labels = label_comma()) +  
  labs(
    title = "Daily Deaths by State",
    x = "Day of Month",
    y = "Total Deaths",
    color = "State"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(size = 10)  
  )

# Plot for daily cases
facet_plot_cases <- ggplot(data_filtered, aes(x = day, y = total_cases, color = state)) +
  geom_point(size = 2, alpha = 0.8) +
  facet_wrap(~month_label, scales = "free_y") +  
  scale_y_continuous(labels = label_comma()) + 
  labs(
    title = "Daily Cases by State",
    x = "Day of Month",
    y = "Total Cases",
    color = "State"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    strip.text = element_text(size = 10) 
  )

# Print the plots
facet_plot_deaths

facet_plot_cases

5. Interactive Line plot

# Ensure date is parsed correctly and remove rows with missing values
data <- data %>%
  mutate(date = mdy(date)) %>%  # Convert to Date format
  filter(!is.na(date) & !is.na(deaths))  # Remove rows with missing values

# Summarize total deaths by county and date
county_time_series <- data %>%
  group_by(county, date) %>%
  summarise(total_deaths = sum(deaths, na.rm = TRUE), .groups = "drop")

# Identify the top 3 counties by total deaths
top_counties <- data %>%
  group_by(county) %>%
  summarise(total_deaths = sum(deaths, na.rm = TRUE)) %>%
  arrange(desc(total_deaths)) %>%
  slice(1:5) %>%
  pull(county)

# Filter data for the top 3 counties
county_time_series_filtered <- county_time_series %>%
  filter(county %in% top_counties)

# Plot using ggplot and format y-axis with commas
county_time_series_plot <- ggplot(county_time_series_filtered, aes(x = date, y = total_deaths, color = county)) +
  geom_line(size = 1) +
  scale_y_continuous(labels = label_comma()) +
  labs(
    title = "COVID-19 Deaths Over Time by Top 5 Counties",
    x = "Date",
    y = "Total Deaths",
    color = "County"
  ) +
  theme_minimal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
# Convert ggplot to Plotly for interactivity
ggplotly(county_time_series_plot)

6. Interactive Geospatial Map

# Summarize data by state
state_data <- data %>%
  group_by(state) %>%
  summarize(total_cases = sum(cases, na.rm = TRUE))

# Get map data for US states
us_states <- maps::map("state", fill = TRUE, plot = FALSE)

# Join map data with state cases
state_data <- state_data %>%
  mutate(region = tolower(state))  # Ensure lowercase for matching

# Create a named vector of total cases for mapping
state_cases <- setNames(state_data$total_cases, state_data$region)

# Create leaflet map with state polygons
leaflet(data = us_states) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~colorNumeric("YlOrRd", domain = state_cases)(state_cases[us_states$names]),
    weight = 1,
    color = "white",
    fillOpacity = 0.7,
    popup = ~paste0(
      "State: ", us_states$names, "<br>",
      "Total Cases: ", state_cases[us_states$names]
    )
  ) %>%
  addLegend(
    pal = colorNumeric("YlOrRd", domain = state_cases),
    values = state_cases,
    title = "Total Cases",
    position = "bottomright"
  )

7.Choropleth Map(Albers projection)

# Summarize data to get total deaths by state
state_deaths_2023 <- data %>%
  group_by(state) %>%
  summarise(total_deaths = sum(deaths, na.rm = TRUE))

# Ensure state names are lowercase for mapping
state_deaths_2023 <- state_deaths_2023 %>%
  mutate(state = str_to_lower(state))

# Load state shape data
states <- map_data("state")

# Rename region column in map data for consistency
states <- states %>%
  rename(state = region)

# Join state shape data with COVID-19 death data
states_joined <- left_join(states, state_deaths_2023, by = "state")

# Plot the Choropleth map
ggplot(data = states_joined, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = total_deaths / 1000), color = "black") + 
  coord_map(projection = "albers", lat0 = 39, lat1 = 45) +  # Use Albers projection
  scale_fill_viridis(option = "magma", direction = -1) +  # Change color scheme for contrast
  theme_minimal() +
  labs(
    title = "Total COVID-19 Deaths by State (in thousands), 2023",
    fill = "Deaths (000s)"
  )

8.Choropleth Map

# Filter data for California
california_cases <- data %>%
  filter(state == "California") %>%
  group_by(county) %>%
  summarise(total_cases = sum(cases, na.rm = TRUE)) %>%
  mutate(county = str_to_lower(county))  # Ensure county names are lowercase

# Load California map data
ca <- map_data("county", "california") %>%
  rename(county = subregion)  # Rename subregion to county for consistency

# Join map data with California COVID-19 data
ca_joined <- left_join(ca, california_cases, by = "county")

# Plot cases by county in California
ggplot(data = ca_joined, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = total_cases), color = "black") +
  coord_quickmap() +
  scale_fill_viridis(option = "magma", direction = -1, na.value = "gray90") +  # Use Viridis color scale
  theme_minimal() +
  labs(
    title = "Total COVID-19 Cases by County in California (2023)",
    fill = "Cases"
  )